Išanalizuoti LOTR knygą naudojant Python NLP metodus: teksto skaidymą ir sentimentų analizę. Naudojami moduliai: NLTK, TextBlob, VADER, Pandas, Matplotlib.
LOTR yra archetipinė maginės fantastikos žanro knyga, pasižyminti mitologiniu naratyvumu ir iš tokios struktūros plaukiančia fabula: iššūkis -> sunkumai -> laiminga pabaiga; gėrio ir blogio priešprieša; pagrindiniai veikėjai nemiršta.
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# Sanity check that tokenization worked: show the first five sentences,
# each followed by extra blank lines.
for idx in range(5):
    print(f"{sentence[idx]}\n\n")
CHAPTER 1 A LONGEXPECTED PARTY When Mr. Bilbo Baggins of Bag End announced that he would shortly be celebrating his eleventyfirst birthday with a party of special magnificence there was much talk and excitement in Hobbiton. Bilbo was very rich and very peculiar and had been the wonder of the Shire for sixty years ever since his remarkable disappearance and unexpected return. The riches he had brought back from his travels had now become a local legend and it was popularly believed whatever the old folk might say that the Hill at Bag End was full of tunnels stuffed with treasure. And if that was not enough for fame there was also his prolonged vigour to marvel at. Time wore on but it seemed to have little effect on Mr. Baggins.
from nltk.probability import FreqDist
from nltk.corpus import stopwords
# Further filtering turned up more words that get in the way of the analysis,
# so they are removed here while the data is still a list (simpler than doing
# it later on a dictionary).
#
# Kept as a set for constant-time membership tests. Every entry needs its own
# trailing comma: Python concatenates adjacent string literals, which used to
# fuse 'Do' and 'So' into the single entry 'DoSo', so neither was filtered.
_EXTRA_STOP_WORDS = {
    '–', 'A', 'Shire', 'Mr', 'Well', 'Road', 'Company', 'River', 'Rivendell',
    'Riders', 'Moria', 'Frodos', 'Do', 'So', 'Come', 'Bree', 'Suddenly',
    'Baggins', 'Great', 'Let', 'Forest', 'Ill', 'Bag', 'Black', 'Lady', 'Old',
    'Mountains', 'Lord', 'find', 'Men', 'Yes', 'Enemy', 'Mordor', 'Yet',
    'Bilbos', 'How',
}
stop_words_remove_filtered = [
    e for e in stop_words_remove if e not in _EXTRA_STOP_WORDS
]
from nltk.tag import pos_tag
#Part Of Speech Tagger - gramatinės grupės priskyrimas
#POS Tag List: https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/
#Ieškosime 'NNP' - Proper Noun
# Aragorn and Strider are the same character, so Strider's count is folded
# into Aragorn's and the Strider entry is removed. The total is computed from
# the data rather than hard-coded (it was 200 + 198 = 398 when written), so it
# stays correct if the upstream filtering changes. Re-running is safe: once
# 'Strider' is gone, pop() contributes 0.
vardai_filtered['Aragorn'] = (
    vardai_filtered.get('Aragorn', 0) + vardai_filtered.pop('Strider', 0)
)
# pip install requests
import requests
from io import BytesIO
from matplotlib.offsetbox import OffsetImage,AnnotationBbox
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
def get_flag(name, directory="C:\\Users\\andri\\Desktop\\path"):
    """Load the picture that belongs to ``name`` as image data.

    The file looked up is ``<directory>/<Name>.png`` where ``<Name>`` is
    ``name.title()``.

    Args:
        name: Label whose title-cased form is the PNG file name.
        directory: Folder holding the PNG files. Defaults to the location
            the notebook was originally written against, so existing
            one-argument calls behave as before.

    Returns:
        Whatever ``plt.imread`` returns for that file.
    """
    import os  # local import: keeps this block self-contained

    path = os.path.join(directory, "{}.png".format(name.title()))
    return plt.imread(path)
def offset_image(coord, name, ax):
    """Place the picture for ``name`` on ``ax`` at x-position ``coord``.

    The picture is loaded via ``get_flag(name)``, scaled with zoom 0.1 and
    anchored at data point ``(coord, 50)``, shifted 16 points downwards.
    """
    picture = OffsetImage(get_flag(name), zoom=0.1)
    picture.image.axes = ax
    anchor = (coord, 50)
    annotation = AnnotationBbox(
        picture,
        anchor,
        xybox=(0., -16.),
        frameon=False,
        xycoords='data',
        boxcoords="offset points",
        pad=0,
    )
    ax.add_artist(annotation)
# Data set: character names and the matching mention counts.
vardai = [
    "Frodo", "Gandalf", "Aragorn", "Sam", "Pippin", "Merry", "Boromir",
    "Elrond", "Gimli", "Legolas", "Sauron", "Gollum", "Saruman",
]
values = [1027, 461, 398, 396, 208, 182, 148, 120, 116, 94, 53, 52, 49]

fig, ax = plt.subplots(figsize=(20, 15), dpi=200)
bar_positions = range(len(vardai))
ax.bar(bar_positions, values, width=0.6, align="center",
       color='red', edgecolor='black')
ax.set_title('Populiariausi Žiedų Valdovo Veikėjai', fontsize=18)
ax.set_ylabel('Kiek kartų minimi knygoje', fontsize=16)
ax.set_xticks(bar_positions)
ax.set_xticklabels(vardai)
# Extra padding pushes the tick labels down, away from the pictures.
ax.tick_params(axis='x', which='major', pad=26)

# For each name in the data set, offset_image() loads the picture file of the
# same name and draws it at that bar's x-position.
for position, character in enumerate(vardai):
    offset_image(position, character, ax)
# pip install textblob
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
# Score the mood of every chapter with TextBlob and store the polarity in a
# dictionary keyed by the chapter's position (as a string, starting at '0').
chapter_sentiment = {
    str(position): TextBlob(text).polarity
    for position, text in enumerate(chapters)
}
print(chapter_sentiment)
{'0': 0.0, '1': 0.10583779056408839, '2': 0.07492613298456124, '3': 0.07772520555627009, '4': 0.0519428738142611, '5': 0.05064237409120138, '6': 0.032974471143347964, '7': 0.07672902589415508, '8': 0.01985750037071965, '9': 0.08318650056307157, '10': 0.02546720214190095, '11': 0.018344421164705853, '12': 0.012253504103822569, '13': 0.11429061991984875, '14': 0.07511622657141744, '15': 0.044020577300209036, '16': 0.01688974056402187, '17': 0.032627172413356643, '18': 0.0834518725778889, '19': 0.0717503041362531, '20': 0.07544858796492791, '21': 0.03488606837067131, '22': 0.025343283224345623}
import numpy as np
import pandas as pd
# Line chart of the per-chapter TextBlob polarity. The slice keeps the rows at
# positions 1..22, i.e. it skips entry '0'.
sentiment_table = pd.DataFrame.from_dict(chapter_sentiment, orient='index')
df1 = sentiment_table.iloc[1:23]
df1.plot(kind='line', figsize=(22, 10), color='purple')
plt.legend(['Kebabai'], fontsize=15)
plt.xlabel('Skyrius', fontsize=15)
csfont = dict(fontname='Comic Sans MS')
plt.title('Knygos skyrių nuotaikos', fontsize=22, **csfont)
plt.show()
# pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Analyzer instance whose polarity_scores() is used for the chapter scoring below.
obj = SentimentIntensityAnalyzer()
# Helper that prints the chosen number of chapters (turned out not to be needed).
def ch1(number):
    """Print the first ``number`` entries of the module-level ``chapters``."""
    for chapter_text in (chapters[k] for k in range(number)):
        print(chapter_text)
# Chapter mood analysis with VADER: one polarity_scores() result per chapter,
# keyed by the chapter's position (as a string, starting at '0').
chapter_sentiment_vader = dict(
    (str(position), obj.polarity_scores(text))
    for position, text in enumerate(chapters)
)
print(chapter_sentiment_vader)
{'0': {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}, '1': {'neg': 0.059, 'neu': 0.808, 'pos': 0.133, 'compound': 1.0}, '2': {'neg': 0.098, 'neu': 0.773, 'pos': 0.128, 'compound': 0.9999}, '3': {'neg': 0.056, 'neu': 0.841, 'pos': 0.104, 'compound': 1.0}, '4': {'neg': 0.076, 'neu': 0.832, 'pos': 0.092, 'compound': 0.9991}, '5': {'neg': 0.069, 'neu': 0.809, 'pos': 0.122, 'compound': 0.9999}, '6': {'neg': 0.068, 'neu': 0.83, 'pos': 0.102, 'compound': 0.9998}, '7': {'neg': 0.058, 'neu': 0.811, 'pos': 0.131, 'compound': 0.9999}, '8': {'neg': 0.077, 'neu': 0.803, 'pos': 0.12, 'compound': 0.9999}, '9': {'neg': 0.053, 'neu': 0.845, 'pos': 0.101, 'compound': 0.9999}, '10': {'neg': 0.088, 'neu': 0.796, 'pos': 0.116, 'compound': 0.9997}, '11': {'neg': 0.087, 'neu': 0.824, 'pos': 0.089, 'compound': 0.9844}, '12': {'neg': 0.101, 'neu': 0.816, 'pos': 0.083, 'compound': -0.9995}, '13': {'neg': 0.062, 'neu': 0.805, 'pos': 0.133, 'compound': 1.0}, '14': {'neg': 0.099, 'neu': 0.792, 'pos': 0.109, 'compound': 0.9998}, '15': {'neg': 0.081, 'neu': 0.826, 'pos': 0.093, 'compound': 0.9996}, '16': {'neg': 0.086, 'neu': 0.833, 'pos': 0.081, 'compound': -0.9951}, '17': {'neg': 0.121, 'neu': 0.806, 'pos': 0.073, 'compound': -0.9998}, '18': {'neg': 0.065, 'neu': 0.836, 'pos': 0.099, 'compound': 0.9999}, '19': {'neg': 0.082, 'neu': 0.809, 'pos': 0.109, 'compound': 0.9997}, '20': {'neg': 0.061, 'neu': 0.823, 'pos': 0.115, 'compound': 0.9999}, '21': {'neg': 0.079, 'neu': 0.841, 'pos': 0.08, 'compound': 0.9865}, '22': {'neg': 0.123, 'neu': 0.77, 'pos': 0.107, 'compound': -0.9985}}
# Build the VADER results table (rows at positions 1..22) and add the extra
# columns: the chapter titles and a Positive/Negative/Neutral rating.
df2 = pd.DataFrame.from_dict(chapter_sentiment_vader, orient='index').iloc[1:23]
df2['Chapters'] = [
    'A Long-expected Party', 'The Shadow of the Past', 'Three is Company',
    'A Short Cut to Mushrooms', 'A Conspiracy Unmasked', 'The Old Forest',
    'In the House of Tom Bombadil', 'Fog on the Barrow-Downs',
    'At the Sign of The Prancing Pony', 'Strider', 'A Knife in the Dark',
    'Flight to the Ford', 'Many Meetings', 'The Council of Elrond',
    'The Ring Goes South', 'A Journey in the Dark', 'The Bridge of Khazad-dum',
    'Lothlorien', 'The Mirror of Galadriel', 'Farewell to Lorien',
    'The Great River', 'The Breaking of the Fellowship',
]
# The rating is derived from the compound score instead of being typed in by
# hand, so it cannot drift out of sync with the data. The cut-offs are the
# conventional VADER ones (>= 0.05 positive, <= -0.05 negative, else neutral);
# for the scores shown in this notebook they give the same labels as the old
# hand-written list.
df2['Rating'] = np.select(
    [df2['compound'] >= 0.05, df2['compound'] <= -0.05],
    ['Positive', 'Negative'],
    default='Neutral',
)
df2
neg | neu | pos | compound | Chapters | Rating | |
---|---|---|---|---|---|---|
1 | 0.059 | 0.808 | 0.133 | 1.0000 | A Long-expected Party | Positive |
2 | 0.098 | 0.773 | 0.128 | 0.9999 | The Shadow of the Past | Positive |
3 | 0.056 | 0.841 | 0.104 | 1.0000 | Three is Company | Positive |
4 | 0.076 | 0.832 | 0.092 | 0.9991 | A Short Cut to Mushrooms | Positive |
5 | 0.069 | 0.809 | 0.122 | 0.9999 | A Conspiracy Unmasked | Positive |
6 | 0.068 | 0.830 | 0.102 | 0.9998 | The Old Forest | Positive |
7 | 0.058 | 0.811 | 0.131 | 0.9999 | In the House of Tom Bombadil | Positive |
8 | 0.077 | 0.803 | 0.120 | 0.9999 | Fog on the Barrow-Downs | Positive |
9 | 0.053 | 0.845 | 0.101 | 0.9999 | At the Sign of The Prancing Pony | Positive |
10 | 0.088 | 0.796 | 0.116 | 0.9997 | Strider | Positive |
11 | 0.087 | 0.824 | 0.089 | 0.9844 | A Knife in the Dark | Positive |
12 | 0.101 | 0.816 | 0.083 | -0.9995 | Flight to the Ford | Negative |
13 | 0.062 | 0.805 | 0.133 | 1.0000 | Many Meetings | Positive |
14 | 0.099 | 0.792 | 0.109 | 0.9998 | The Council of Elrond | Positive |
15 | 0.081 | 0.826 | 0.093 | 0.9996 | The Ring Goes South | Positive |
16 | 0.086 | 0.833 | 0.081 | -0.9951 | A Journey in the Dark | Negative |
17 | 0.121 | 0.806 | 0.073 | -0.9998 | The Bridge of Khazad-dum | Negative |
18 | 0.065 | 0.836 | 0.099 | 0.9999 | Lothlorien | Positive |
19 | 0.082 | 0.809 | 0.109 | 0.9997 | The Mirror of Galadriel | Positive |
20 | 0.061 | 0.823 | 0.115 | 0.9999 | Farewell to Lorien | Positive |
21 | 0.079 | 0.841 | 0.080 | 0.9865 | The Great River | Positive |
22 | 0.123 | 0.770 | 0.107 | -0.9985 | The Breaking of the Fellowship | Negative |
# Chart: horizontal bars of the VADER compound score per chapter, coloured by
# the Rating column.
import matplotlib.patches as mpatches
fig, ax = plt.subplots(figsize=(15, 18))
ax.set_alpha(0.5)
ax.barh(df2['Chapters'], df2['compound'],
        color=df2.Rating.map({'Positive': 'g', 'Negative': 'r', 'Neutral': 'orange'}),
        alpha=.5)
ax.set_title("Nuotaikos pagal skyrių naudojant 'Vader' modulį", fontsize=33)
# Fixed the spelling of the axis label ("įvvertinimas" -> "įvertinimas").
ax.set_xlabel("Bendras įvertinimas (Range= -1.0 - 1.0)", fontsize=27)
ax.set_ylabel("Skyriai", fontsize=27)
# barh() was given the chapter titles as y values, so the ticks already carry
# those labels. Only their size needs changing; doing that through
# tick_params avoids the "FixedFormatter should only be used together with
# FixedLocator" UserWarning that set_yticklabels() raised here.
ax.tick_params(axis='y', which='major', labelsize=22, labelrotation=0)
# NOTE(review): all_handles is not defined in the visible code — presumably
# legend patches built elsewhere with mpatches; confirm.
ax.legend(handles=all_handles, loc='upper left', fontsize=20)
ax.tick_params(axis='x', which='major', labelsize=18)
# Flip the y axis so the first chapter is drawn at the top.
ax.invert_yaxis()
plt.show()
C:\Users\andri\AppData\Local\Temp\ipykernel_5168\1672071153.py:12: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(df2.Chapters, rotation=0, fontsize=22)
def veikejas_sakinys(veikejo_vardas, sakiniu_sarasas):
    """Sum the TextBlob polarity of every sentence that mentions a character.

    Goes through the sentences one by one; each sentence containing
    ``veikejo_vardas`` contributes its own polarity to the total.

    Fixes over the previous version:
      * the per-sentence blob was stored as ``bolb`` but ``blob`` was read,
        so every match added the polarity of an unrelated global ``blob``
        (the score was effectively a match count times a constant);
      * the global ``sentence`` list was iterated instead of the
        ``sakiniu_sarasas`` argument.

    Args:
        veikejo_vardas: Name to look for. Matching is a plain substring
            test, so 'Sam' also matches e.g. 'Same'.
        sakiniu_sarasas: Iterable of sentence strings.

    Returns:
        The summed polarity; ``0`` when no sentence contains the name.
    """
    ivertinimas = 0
    for sakinys in sakiniu_sarasas:
        # Cheap substring test first: no need to analyse sentences that do
        # not mention the character at all.
        if veikejo_vardas not in sakinys:
            continue
        ivertinimas += TextBlob(sakinys).polarity
    return ivertinimas
# Quick check of the function: print the sentence-level score for 'Sam'.
print(veikejas_sakinys('Sam', sentence))
24.201983530453123
# Fix up Aragorn: overwrite his sentence-level score with a hard-coded value.
# NOTE(review): presumably the combined Aragorn + Strider total — confirm.
veikejas_raktas['Aragorn'] = 24.3812574825303
# NOTE(review): this removes 'Strider' from vardai_filtered, not from
# veikejas_raktas — confirm that is the intended dictionary.
vardai_filtered.pop('Strider', None)
# Check that the text was split correctly: show the first ten paragraphs,
# each followed by a separator line.
separator = '\n------------------\n'
for position in range(10):
    print(paragraph[position], separator, sep='\n')
CHAPTER 1 ------------------ A LONGEXPECTED PARTY ------------------ When Mr Bilbo Baggins of Bag End announced that he would shortly be celebrating his eleventyfirst birthday with a party of special magnificence there was much talk and excitement in Hobbiton Bilbo was very rich and very peculiar and had been the wonder of the Shire for sixty years ever since his remarkable disappearance and unexpected return The riches he had brought back from his travels had now become a local legend and it was popularly believed whatever the old folk might say that the Hill at Bag End was full of tunnels stuffed with treasure And if that was not enough for fame there was also his prolonged vigour to marvel at Time wore on but it seemed to have little effect on Mr Baggins At ninety he was much the same as at fifty At ninetynine they began to call him wellpreserved but unchanged would have been nearer the mark There were some that shook their heads and thought this was too much of a good thing it seemed unfair that anyone should possess apparently perpetual youth as well as reputedly inexhaustible wealth It will have to be paid for they said It isnt natural and trouble will come of it But so far trouble had not come and as Mr Baggins was generous with his money most people were willing to forgive him his oddities and his good fortune He remained on visiting terms with his relatives except of course the SackvilleBagginses and he had many devoted admirers among the hobbits of poor and unimportant families But he had no close friends until some of his younger cousins began to grow up ------------------ 28 ------------------ the fellowship of the ring ------------------ The eldest of these and Bilbos favourite was young Frodo Baggins When Bilbo was ninetynine he adopted Frodo as his heir and brought him to live at Bag End and the hopes of the SackvilleBagginses were finally dashed Bilbo and Frodo happened to have the same birthday September 22nd You had better come and live here Frodo 
my lad said Bilbo one day and then we can celebrate our birthdayparties comfortably together At that time Frodo was still in his tweens as the hobbits called the irresponsible twenties between childhood and coming of age at thirtythree Twelve more years passed Each year the Bagginses had given very lively combined birthdayparties at Bag End but now it was understood that something quite exceptional was being planned for that autumn Bilbo was going to be eleventyone 111 a rather curious number and a very respectable age for a hobbit the Old Took himself had only reached 130 and Frodo was going to be thirtythree 33 an important number the date of his coming of age Tongues began to wag in Hobbiton and Bywater and rumour of the coming event travelled all over the Shire The history and character of Mr Bilbo Baggins became once again the chief topic of conversation and the older folk suddenly found their reminiscences in welcome demand No one had a more attentive audience than old Ham Gamgee commonly known as the Gaffer He held forth at The Ivy Bush a small inn on the Bywater road and he spoke with some authority for he had tended the garden at Bag End for forty years and had helped old Holman in the same job before that Now that he was himself growing old and stiff in the joints the job was mainly carried on by his youngest son Sam Gamgee Both father and son were on very friendly terms with Bilbo and Frodo They lived on the Hill itself in Number 3 Bagshot Row just below Bag End A very nice wellspoken gentlehobbit is Mr Bilbo as Ive always said the Gaffer declared With perfect truth for Bilbo was very polite to him calling him Master Hamfast and ------------------ a longexpected party ------------------ 29 ------------------ consulting him constantly upon the growing of vegetables – in the matter of roots especially potatoes the Gaffer was recognized as the leading authority by all in the neighbourhood including himself But what about this Frodo that lives with him asked 
Old Noakes of Bywater Baggins is his name but hes more than half a Brandybuck they say It beats me why any Baggins of Hobbiton should go looking for a wife away there in Buckland where folks are so queer And no wonder theyre queer put in Daddy Twofoot the Gaffers nextdoor neighbour if they live on the wrong side of the Brandywine River and right agin the Old Forest Thats a dark bad place if half the tales be true Youre right Dad said the Gaffer Not that the Brandybucks of Buckland live in the Old Forest but theyre a queer breed seemingly They fool about with boats on that big river – and that isnt natural Small wonder that trouble came of it I say But be that as it may Mr Frodo is as nice a young hobbit as you could wish to meet Very much like Mr Bilbo and in more than looks After all his father was a Baggins A decent respectable hobbit was Mr Drogo Baggins there was never much to tell of him till he was drownded Drownded said several voices They had heard this and other darker rumours before of course but hobbits have a passion for family history and they were ready to hear it again Well so they say said the Gaffer You see Mr Drogo he married poor Miss Primula Brandybuck She was our Mr Bilbos first cousin on the mothers side her mother being the youngest of the Old Tooks daughters and Mr Drogo was his second cousin So Mr Frodo is his first and second cousin once removed either way as the saying is if you follow me And Mr Drogo was staying at Brandy Hall with his fatherinlaw old Master Gorbadoc as he often did after his marriage him being partial to his vittles and old Gorbadoc keeping a mighty generous table and he went out ------------------ 30 ------------------
def veikejas_paragrafas(veikejo_vardas, paragrafu_sarasas):
    """Sum the TextBlob polarity of every paragraph that mentions a character.

    Same idea as the sentence-level function, applied to a list of
    paragraphs: each paragraph containing ``veikejo_vardas`` contributes its
    own polarity to the total.

    Fixes over the previous version:
      * the per-paragraph blob was stored as ``bolb`` but ``blob`` was read,
        so every match added the polarity of an unrelated global ``blob``;
      * the global ``paragraph`` list was iterated instead of the
        ``paragrafu_sarasas`` argument.

    Args:
        veikejo_vardas: Name to look for. Matching is a plain substring test.
        paragrafu_sarasas: Iterable of paragraph strings.

    Returns:
        The summed polarity; ``0`` when no paragraph contains the name.
    """
    ivertinimas = 0
    for pastraipa in paragrafu_sarasas:
        # Skip the analysis entirely for paragraphs without the name.
        if veikejo_vardas not in pastraipa:
            continue
        ivertinimas += TextBlob(pastraipa).polarity
    return ivertinimas
# Print the paragraph-level score for 'Strider'.
print(veikejas_paragrafas('Strider', paragraph))
4.959912674142201
# Fix up Aragorn: overwrite his paragraph-level score with a hard-coded value.
# NOTE(review): presumably the combined Aragorn + Strider total — confirm.
veikejas_raktas_paragrafas['Aragorn'] = 11.65280688503288
# NOTE(review): this removes 'Strider' from vardai_filtered, not from
# veikejas_raktas_paragrafas — confirm that is the intended dictionary.
vardai_filtered.pop('Strider', None)
# One DataFrame each for the sentence-level and paragraph-level results;
# this one holds the sentence-level scores.
dfs = pd.DataFrame.from_dict(veikejas_raktas_sorted, orient='index')
dfs.head(40)

# Row labels to leave out of the ranking.
excluded_labels = [
    'F', 'O', 'G', 'M', 'Me', 'Go', 'yon', 'Ho', 'Be', 'Tom', 'Hi', 'Ride',
    'Rider', 'Hi', 'Is', 'Or', 'Brand', 'Mount', 'Her', 'Mountain', 'Nor',
    'West', 'Brandy', 'fill', 'North', 'Elf', 'Lor', 'Im', 'Hal', 'Sun',
    'Dark', 'South', 'My', 'Their', 'Man', 'Dont', 'Too', 'Buck', 'Master',
    'field', 'Sea', 'Far', 'Good', 'Orc', 'Though', 'Ever', 'Gate', 'Elvish',
    'Moon', 'flame', 'Minas', 'Hobbiton', 'Hill', 'Ive', 'Your', 'Down',
    'Have', 'May', 'Nine', 'Dwarves', 'Soon', 'Loth', 'Elven', 'Many',
    'Mirkwood', 'Bom', 'Tirith', 'Galadriel', 'Brandybuck', 'From', 'Council',
    'Bombadil', 'Lothlórien', 'CHAPTER', 'Ah', 'Three', 'fit', 'Shadow',
    'Gold', 'Bill', 'Gondor', 'Middle', 'Buckland', 'Day', 'Just',
    'Middleearth', 'White', 'Wise', 'Lórien', 'East',
]
dfs.drop(index=excluded_labels, inplace=True)
dfs.head(15)
0 | |
---|---|
Frodo | 64.837413 |
Gandalf | 27.488673 |
Aragorn | 24.381257 |
Sam | 24.201984 |
Pippin | 12.549177 |
Merry | 11.294259 |
Boromir | 9.262488 |
Elrond | 7.888054 |
Gimli | 7.170958 |
Legolas | 5.677008 |
Sauron | 3.406205 |
Gollum | 3.346447 |
Butterbur | 2.987899 |
Saruman | 2.868383 |
Haldir | 2.569593 |
# Paragraph-level DataFrame.
dfh = pd.DataFrame.from_dict(veikejas_raktas_paragrafas_sorted, orient='index')

# Row labels to leave out of the ranking.
excluded_labels = [
    'F', 'O', 'G', 'M', 'Me', 'Go', 'yon', 'Ho', 'Be', 'Tom', 'Hi', 'Ride',
    'Rider', 'Hi', 'Is', 'Or', 'Brand', 'Mount', 'Her', 'Mountain', 'Nor',
    'West', 'Brandy', 'fill', 'North', 'Elf', 'Lor', 'Im', 'Hal', 'Sun',
    'Dark', 'South', 'My', 'Their', 'Man', 'Dont', 'Too', 'Buck', 'Master',
    'field', 'Sea', 'Far', 'Good', 'Orc', 'Though', 'Ever', 'Gate', 'Elvish',
    'Moon', 'flame', 'Minas', 'Hobbiton', 'Hill', 'Ive', 'Your', 'Down',
    'Have', 'May', 'Nine', 'Dwarves', 'Soon', 'Loth', 'Elven', 'Many',
    'Mirkwood', 'Bom', 'Tirith', 'Galadriel', 'Brandybuck', 'From', 'Council',
    'Bombadil', 'Lothlórien', 'CHAPTER', 'Ah', 'Three', 'fit', 'Shadow',
    'Gold', 'Bill', 'Gondor', 'Middle', 'Buckland', 'Day', 'Just',
    'Middleearth', 'White', 'Wise', 'Lórien', 'East',
]
dfh.drop(index=excluded_labels, inplace=True)
dfh.head(15)
0 | |
---|---|
Frodo | 22.409244 |
Gandalf | 12.788209 |
Aragorn | 11.652807 |
Sam | 11.473533 |
Pippin | 7.828296 |
Merry | 6.931926 |
Gimli | 4.780639 |
Boromir | 4.720881 |
Elrond | 4.302575 |
Legolas | 3.704995 |
Sauron | 2.390319 |
Gollum | 1.673224 |
Butterbur | 1.553708 |
Saruman | 1.254918 |
Isildur | 1.254918 |
# Compare the sentence-level and paragraph-level scores side by side.
# Both inputs carry a single column labelled 0, so the columns are renamed
# after concatenation; otherwise the table shows two indistinguishable "0"
# columns.
pd.concat([dfs, dfh], axis=1).set_axis(['Sakiniai', 'Paragrafai'], axis=1).head(15)
0 | 0 | |
---|---|---|
Frodo | 64.837413 | 22.409244 |
Gandalf | 27.488673 | 12.788209 |
Aragorn | 24.381257 | 11.652807 |
Sam | 24.201984 | 11.473533 |
Pippin | 12.549177 | 7.828296 |
Merry | 11.294259 | 6.931926 |
Boromir | 9.262488 | 4.720881 |
Elrond | 7.888054 | 4.302575 |
Gimli | 7.170958 | 4.780639 |
Legolas | 5.677008 | 3.704995 |
Sauron | 3.406205 | 2.390319 |
Gollum | 3.346447 | 1.673224 |
Butterbur | 2.987899 | 1.553708 |
Saruman | 2.868383 | 1.254918 |
Haldir | 2.569593 | 1.254918 |
# Export both tables as CSV files for use in Tableau.
for frame, filename in ((dfs, 'Vardai_sakiniai.csv'), (dfh, 'Vardai_paragrafai.csv')):
    frame.to_csv(filename)
Taip pat labai įdomu tai, jog blogiausi veikėjai šioje analizėje nėra blogiečiai pagal nutylėjimą. Isilduras – veikėjas, dėl kurio ir prasidėjo visa knyga. Turėdamas galimybę sunaikinti blogį, jis pasidavė pagundoms, išdavė gėrį ir užvirė visą peklą, kuri dabar sprendžiama per 3 knygos dalis.
In case you were wondering... this came from here: